{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Copyright (c) Facebook, Inc. and its affiliates.\n",
    "\"\"\"\n",
    "\n",
    "\"\"\"This file has functions to preprocess the chat from user before\n",
    "querying the dialogue manager\"\"\"\n",
    "import string\n",
    "\n",
    "from spacy.lang.en import English\n",
    "from typing import List\n",
    "\n",
    "# Use the tokenizer attached to a blank English pipeline. This attribute is\n",
    "# available in both spaCy v2 and v3, whereas Defaults.create_tokenizer()\n",
    "# no longer exists in spaCy v3.\n",
    "tokenizer = English().tokenizer\n",
    "\n",
    "\n",
    "def word_tokenize(st) -> str:\n",
    "    \"\"\"Return the tokens of a chat joined by single spaces.\n",
    "\n",
    "    The chat is first padded by insert_spaces and then split into tokens by\n",
    "    the module-level tokenizer.\n",
    "    \"\"\"\n",
    "    spaced_chat = insert_spaces(st)\n",
    "    tokens = tokenizer(spaced_chat)\n",
    "    return \" \".join(map(str, tokens))\n",
    "\n",
    "\n",
    "def sentence_split(st):\n",
    "    \"\"\"Split a tokenized chat into lower-cased, punctuation-free sentences.\n",
    "\n",
    "    The tokens \" ?\", \" !\" and \" ...\" are rewritten to \" .\", which is then used\n",
    "    as the sentence delimiter. Tokens found in string.punctuation are removed\n",
    "    and sentences that end up empty are discarded.\n",
    "    \"\"\"\n",
    "    for terminator in (\" ?\", \" !\", \" ...\"):\n",
    "        st = st.replace(terminator, \" .\")\n",
    "\n",
    "    sentences = []\n",
    "    for chunk in st.split(\" .\"):\n",
    "        kept = [tok for tok in chunk.lower().split() if tok not in string.punctuation]\n",
    "        sentence = \" \".join(kept)\n",
    "        if sentence != \"\":\n",
    "            sentences.append(sentence)\n",
    "    return sentences\n",
    "\n",
    "\n",
    "def insert_spaces(chat):\n",
    "    \"\"\"Pad digits that touch brackets, commas, colons or the letter x.\n",
    "\n",
    "    A space is appended after an opening bracket, comma, colon or \"x\" that is\n",
    "    directly followed by a digit, and after a digit that is directly followed\n",
    "    by a comma, closing bracket, colon or \"x\" (so \"4x\" becomes \"4 x\").\n",
    "    \"\"\"\n",
    "    before_digit = [\"[\", \"(\", \"{\", \",\", \":\", \"x\"]\n",
    "    after_digit = [\",\", \"]\", \")\", \"}\", \":\", \"x\"]\n",
    "    last_index = len(chat) - 1\n",
    "\n",
    "    pieces = []\n",
    "    for idx, char in enumerate(chat):\n",
    "        needs_space = False\n",
    "        # The final character has no successor, so it is never padded.\n",
    "        if idx != last_index:\n",
    "            following = chat[idx + 1]\n",
    "            needs_space = (char in before_digit and following.isdigit()) or (\n",
    "                char.isdigit() and following in after_digit\n",
    "            )\n",
    "        pieces.append(char + \" \" if needs_space else char)\n",
    "\n",
    "    return \"\".join(pieces)\n",
    "\n",
    "\n",
    "def preprocess_chat(chat: str) -> List[str]:\n",
    "    \"\"\"Tokenize a chat and break it into cleaned sentences.\n",
    "\n",
    "    Debug chats (exactly \"_debug_\", or anything starting with \"_ttad_\") are\n",
    "    returned unchanged as a single-element list.\n",
    "    \"\"\"\n",
    "    is_debug_chat = chat == \"_debug_\" or chat.startswith(\"_ttad_\")\n",
    "    if is_debug_chat:\n",
    "        return [chat]\n",
    "\n",
    "    # Tokenize first, then split the tokenized line into sentences.\n",
    "    return list(sentence_split(word_tokenize(chat)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess every raw chat and write its first sentence on its own line.\n",
    "with open('real_data/humanbot/raw_data/all_appen_humanbot_data.txt') as f, open('real_data/humanbot/humanbot_all_data_preprocessed.txt', 'w') as f2:\n",
    "    for line in f:\n",
    "        sentences = preprocess_chat(line.strip())\n",
    "        # preprocess_chat returns [] for blank or punctuation-only lines;\n",
    "        # skip those instead of failing on the [0] lookup.\n",
    "        if not sentences:\n",
    "            continue\n",
    "        f2.write(sentences[0] + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the distinct preprocessed chats found in first_65/all_combined.txt.\n",
    "prev_text = set()\n",
    "with open('../../minecraft/python/craftassist/text_to_tree_tool/turk_data/new_dance_form_data/first_65/all_combined.txt') as f:\n",
    "    for line in f:\n",
    "        # The chat is taken from the first tab-separated field of the line.\n",
    "        parts = line.strip().split(\"\\t\")\n",
    "        sentences = preprocess_chat(parts[0].strip().lower())\n",
    "        # Blank or punctuation-only chats produce no sentences; skip them\n",
    "        # rather than raising IndexError on [0].\n",
    "        if sentences:\n",
    "            prev_text.add(sentences[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "63\n"
     ]
    }
   ],
   "source": [
    "# Number of distinct preprocessed chats collected in prev_text.\n",
    "print(len(prev_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "83\n"
     ]
    }
   ],
   "source": [
    "# Collect the distinct preprocessed chats found in all_data.txt.\n",
    "all_text = set()\n",
    "with open('../../minecraft/python/craftassist/text_to_tree_tool/turk_data/new_dance_form_data/all_data.txt') as f:\n",
    "    for line in f:\n",
    "        sentences = preprocess_chat(line.strip().lower())\n",
    "        # Blank or punctuation-only lines produce no sentences; skip them\n",
    "        # rather than raising IndexError on [0].\n",
    "        if sentences:\n",
    "            all_text.add(sentences[0])\n",
    "print(len(all_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bob your head\n",
      "rotate your camera up\n",
      "stare in my eyes for a minute\n",
      "tilt 10 degrees\n",
      "look to the left of the chair\n",
      "do n’t stare at me\n",
      "look in the distance\n",
      "look who is calling on my phone\n",
      "look at my right hand\n",
      "stare in the distance\n",
      "face abhinav\n",
      "aim your camera up a little\n",
      "stick your arm out\n",
      "pan 90 degrees\n",
      "put your head down in shame\n",
      "reach towards the ceiling\n",
      "stare down\n",
      "do n’t look at the sun directly\n",
      "look at yourself in the mirror\n",
      "look at this venn diagram i made\n",
      "20\n"
     ]
    }
   ],
   "source": [
    "# Chats present in all_text but not in prev_text, printed as they are found.\n",
    "new_text = set()\n",
    "for chat in all_text:\n",
    "    if chat in prev_text:\n",
    "        continue\n",
    "    new_text.add(chat)\n",
    "    print(chat)\n",
    "print(len(new_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bob your head\n",
      "stare in my eyes for a minute\n",
      "look at yourself in the mirror\n",
      "rotate your camera up\n",
      "stare down\n",
      "do n’t look at the sun directly\n",
      "look in the distance\n",
      "look who is calling on my phone\n",
      "tilt 10 degrees\n",
      "look at my right hand\n",
      "stare in the distance\n",
      "pan 90 degrees\n",
      "look to the left of the chair\n",
      "put your head down in shame\n",
      "look at this venn diagram i made\n",
      "face abhinav\n",
      "aim your camera up a little\n",
      "do n’t stare at me\n",
      "stick your arm out\n",
      "reach towards the ceiling\n"
     ]
    }
   ],
   "source": [
    "# Write the new chats one per line. They are sorted because the iteration\n",
    "# order of a set of strings can change between kernel sessions (string hash\n",
    "# randomization), which would make the output file irreproducible.\n",
    "with open('../../minecraft/python/craftassist/text_to_tree_tool/turk_data/new_dance_form_data/next_20/tool1/input.txt', 'w') as f:\n",
    "    for line in sorted(new_text):\n",
    "        f.write(line + \"\\n\")\n",
    "        print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
